Data Analysis of Aviation Safety Occurance

Data Analysis Project Final Phase: Answering Questions

Amirabbas Jalali - Mina Moosavifar


در این پروژه رخدادهای امنیت پروازها از سال ۱۹۱۹ تا کنون را بررسی می کنیم.


توضیحات نحوه ی جمع آوری داده ها و پاکسازی آن در فایل report.pdf آمده است. همچنین در فاز پیشین تحلیل مکاشفه ای بر روی داده ها صورت گرفته و به سوالات ۱ و ۶ و ۷ پاسخ داده شده بود.
داده های ذخیره شده از طریق لینک زیر قابل دسترسی است:
https://github.com/Ajal88/DA_Project
هم چنین گزارش پروژه از طریق لینک زیر نیز قابل مشاهده است:
https://ajal88.github.io/DA_Project/Report_Phase2.html


کتابخانه های مورد نیاز و بارگذاری داده ها

library(readr)
library(dplyr)
library(stringr)
library(highcharter)
library(ggplot2)
library(stringr)
library(topicmodels)
library(tidytext)

casn <- readr::read_csv("Data/asn_c.csv") %>% as.data.frame(stringsAsFactors = F) %>% 
  mutate(Total_occupants = ifelse(Total_occupants == 0 & Total_fatalities != 0, Total_fatalities, Total_occupants),
         Total_survivors = abs(Total_occupants - Total_fatalities)) %>% 
  mutate(Total_survivors = ifelse(Total_survivors > Total_occupants, Total_occupants, abs(Total_occupants - Total_fatalities))) %>% 
  mutate(is_army = str_detect(Operator, regex("Force|Navy",ignore_case = T))) %>% 
  mutate(occ_no = row_number())

۱. آنالیز متن و دسته بندی علت وقوع مشکلات پرواز ها

برای این منظور می خواهیم از مدل lda استفاده کنیم. به همین دلیل ابتدا از داده های اصلی تنها شماره ی تصادف و Narrative را انتخاب می کنیم. سپس علت سقوط را به کلمات آن تبدیل می کنیم و stopwords را از آن حذف می کنیم. سپس کلمات رایج در سقوط هواپیما همچون هواپیما و پرواز را از کلمات حذف می کنیم. در نهایت نیز تعداد تکرار هر لغت را برای هر Narrative بدست می آوریم. سپس از آنجایی که LDA با DocumentTermMatrix کار می کند، ساختار داده ی خود را به این صورت تغییر می دهیم. در نهایت نیز مدل خود را با ۲۰ topic لرن می کنیم. از آنجایی که لرن مدل وقت گیر است، مدل را برای استفاده ی آینده ذخیره می کنیم.

cause <- casn %>% select(occ_no, Narrative)

# split into words
cause_word <- cause %>%
  unnest_tokens(output = word, input = Narrative)

# plane stopwords
word = c("aircraft", "airplane", "plane", "flight")
plane_stop_words = data_frame(word)

word_counts <- cause_word %>%
  anti_join(stop_words) %>%
  anti_join(plane_stop_words) %>%
  count(occ_no, word, sort = TRUE)

flight_dtm <- word_counts %>%
  cast_dtm(occ_no, word, n)
accident_lda <- LDA(flight_dtm, k = 20, control = list(seed = 1234))
saveRDS(accident_lda, file="Data/lda.rds")
accident_lda = readRDS(file="Data/lda.rds")

سپس برای هر topic پنج کلمه ای که بیشترین احتمال حضور در این موضوع دارد را نمایش می دهیم.

accident_topics <- tidy(accident_lda, matrix = "beta")

top_terms <- accident_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free", nrow = 5) +
  coord_flip()

top_terms_merge <- top_terms %>% group_by(topic) %>% summarise(words = paste(term, collapse=", "))

knitr::kable(top_terms_merge)
topic words
1 landing, gear, main, collapsed, undercarriage
2 sea, missing, water, hit, hangar
3 runway, feet, short, left, rest
4 crash, terrain, lake, airstrip, pilot
5 engine, fuel, wing, left, takeoff
6 pilot, crew, test, cabin, system
7 hijacker, demanded, hijackers, passengers, 1
8 runway, captain, feet, approach, pilot
9 damage, airport, sustained, substantial, accident
10 crashed, mountain, 2, km, antonov
11 accident, destroyed, killed, airport, crew
12 fire, caught, ground, destroyed, de
13 feet, departed, crew, reported, cleared
14 engine, takeoff, landing, forced, lost
15 damaged, repair, accident, reportedly, raf
16 airport, boeing, international, air, otter
17 dc, 3, struck, 4, douglas
18 en, route, night, operation, NA
19 air, force, transport, base, flying
20 approach, weather, pilot, conditions, visibility

۲. رتبه بندی علت وقوع مشکلات برای پرواز ها

برای این بخش احتمال هر تاپیک برای هر سند را بدست می آوریم. (gamma) سپس تاپیکی که بیشترین احتمال را داراست به عنوان موضوع سند انتخاب می کنیم. سپس پروازها را بر اساس تاپیک دسته بندی کردی و تعداد رخدادهای امنیتی هر تاپیک را بدست می آوریم. نمودار علت سقوط پروازها بر اساس تعداد تکرار به صورت زیر است:

accidents_gamma <- tidy(accident_lda, matrix = "gamma")

accidents_data <- accidents_gamma %>% group_by(document) %>% 
  top_n(1, gamma) %>%
  ungroup()

accident_summary <- accidents_data %>% group_by(topic) %>% summarise(count = n()) %>% 
  arrange(desc(count))

accident_summary <- accident_summary %>% inner_join(top_terms_merge, by = c("topic"))

occurance_sum = sum(accident_summary$count)
accident_summary <- accident_summary %>% mutate(count_percent = 100*count/occurance_sum)

knitr::kable(accident_summary %>% select(-count_percent))
topic count words
10 2169 crashed, mountain, 2, km, antonov
15 1816 damaged, repair, accident, reportedly, raf
14 1672 engine, takeoff, landing, forced, lost
19 1296 air, force, transport, base, flying
9 1212 damage, airport, sustained, substantial, accident
7 1206 hijacker, demanded, hijackers, passengers, 1
3 1184 runway, feet, short, left, rest
11 1122 accident, destroyed, killed, airport, crew
20 1024 approach, weather, pilot, conditions, visibility
2 986 sea, missing, water, hit, hangar
18 893 en, route, night, operation, NA
1 875 landing, gear, main, collapsed, undercarriage
13 862 feet, departed, crew, reported, cleared
12 741 fire, caught, ground, destroyed, de
5 673 engine, fuel, wing, left, takeoff
8 667 runway, captain, feet, approach, pilot
6 640 pilot, crew, test, cabin, system
4 636 crash, terrain, lake, airstrip, pilot
17 633 dc, 3, struck, 4, douglas
16 570 airport, boeing, international, air, otter
accident_summary %>% arrange(topic) %>% 
  hchart(type = "pie", hcaes(x = words ,y = count_percent)) %>% 
  hc_yAxis(title = list(text = "Count")) %>% 
  hc_xAxis(title = list(text = "Topic")) %>% 
  hc_title(text = "Airsafety Occurance Based on Topic", style = list(fontWeight = "bold")) %>% 
  hc_add_theme(hc_theme_538())

۳. آیا در صورت سقوط پرواز یک ایرلاین، دیگر نباید با آن پرواز کنیم؟

برای این سوال از داده های حاوی میزان مسافت پرواز و ظرفیت هواپیما بهره بردیم و شاخص ASK را برابر میزان صندلی دردسترس کلیومتر قرار دادیم ایم وشاخص یک میلیون برابر را اعمال کرده ایم. همانطور که از شکل و نتیجه ی تست برای میزان مرگ و میر بدست آمده نمی توان از میزان کشته های یک شرکت هواپیمایی در سوانح هوایی آن پیش بینی ای برای سال های آینده ی آن انجام داد و نمی توان دلیلی برای سفرنکردن با آن شرکت درنظر گرفت اما حوادث به صورت کلی رابطه دارد با میزان حوادثی که در گذشته ی آن شرکت رخ داده، واین رابطه رابطه ایست مستقیم که از نتیجه ای تست و نمودار نیز مشخص است و می توان نتیجه گرفت که شرکتی که حوادث بیشتری را تجربه کرده است در آینده نیز حوادث بیشتری را تجربه خواهد کرد و سفر با آن شرکت هواپیمایی ریسک بالاتری خواهد داشت. می توان این اتفاق را به سیاست گذاری و مدیریت هواپیمایی مربوط دانست.

airline_safety = read_csv("Data/airline_safety.csv") %>% 
  mutate(death_ask_85_99 = (fatalities_85_99/avail_seat_km_per_week)*10^6,
         death_ask_00_14 = (fatalities_00_14/avail_seat_km_per_week)*10^6)

ggplot(airline_safety, aes(x = death_ask_85_99, y = death_ask_00_14)) +
  geom_point() + 
  geom_text(aes(label=ifelse(death_ask_00_14>0.4 | death_ask_85_99>0.4,as.character(airline),'')),
            hjust=-0.1,
            vjust=-0.1,
            angle=10,
            size=3) +
  scale_x_continuous(name="Death in 1985 - 1999 ASK", limits=c(0, 1.2)) +
  scale_y_continuous(name="Death in 2000 - 2014 ASK", limits=c(0, 1)) + 
  geom_smooth(method = lm, se = FALSE) + 
  ggtitle("Fatalities of Companies")

cor.test(airline_safety$fatalities_85_99, airline_safety$fatalities_00_14)

    Pearson's product-moment correlation

data:  airline_safety$fatalities_85_99 and airline_safety$fatalities_00_14
t = 0.3456, df = 54, p-value = 0.731
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.2186217  0.3060989
sample estimates:
       cor 
0.04697895 
ggplot(airline_safety, aes(x = incidents_85_99, y = incidents_00_14)) +
  geom_point() + 
  geom_text(aes(label=ifelse(incidents_00_14>10 | incidents_85_99>20,as.character(airline),'')),
            hjust=0,
            vjust=0,
            angle=10,
            size=3) +
  scale_x_continuous(name="Incidents in 1985 - 1999 ASK") +
  scale_y_continuous(name="Incidents in 2000 - 2014 ASK") + 
  geom_smooth(method = lm, se = FALSE) + 
  ggtitle("Incidents of Companies")

cor.test(airline_safety$incidents_85_99, airline_safety$incidents_00_14)

    Pearson's product-moment correlation

data:  airline_safety$incidents_85_99 and airline_safety$incidents_00_14
t = 3.2359, df = 54, p-value = 0.002073
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1567121 0.6021146
sample estimates:
      cor 
0.4030088 

۴. آیا واقعا سن هواپیما در میزان تلفات آن موثر است؟

برای بررسی این موضوع بایستی سن یک هواپیما و نرخ تلفات آن را محسابه کنیم و از آزمون فرض cor.test بهره می بریم تا رابطه ی این دو متغیر را بیابیم.
همانطور که در نتیجه ی این آزمون مشهود است کمتربودن سن یک هواپیما هیچ تاثیری در کمتربودن نرخ کشته شدگان آن حادثه ی هواپیمایی ندارد و این نسبت اندکی به سمت رابطه ی عکس است اما دلیل محکمی بر وارونه بودن این فرض نیست، به طور کلی می توان از نتیجه این برداشت را داشت که هواپیماهایی که از کیفیت مناسبی برخوردار نیستند، زودتر نابود شده و سنین بالا را نمی بینند و از این رو هواپیماهایی که سن زیادی دارند در اولین حادثه ی منجر به تلفات کم نیز کاملا از خطوط هوایی خارج می شوند. این تفسیر و تحلیل این نتیجه را تایید می کند که منطقی نیز هست و داده های کاملا آن را تصدیق می کنند.

age_death = read_csv("Data/asn_c.csv") %>% as.data.frame(stringsAsFactors = F) %>% 
  mutate(Total_occupants = ifelse(Total_occupants == 0 & Total_fatalities != 0, Total_fatalities, Total_occupants),
         Total_survivors = abs(Total_occupants - Total_fatalities)) %>% 
  mutate(Total_survivors = ifelse(Total_survivors > Total_occupants, Total_occupants, abs(Total_occupants - Total_fatalities))) %>% 
  mutate(is_army = str_detect(Operator, regex("Force|Navy",ignore_case = T))) %>% 
  mutate(occ_no = row_number()) %>% 
  select(C.n.msn, Date, FirstFlight, Total_occupants, Total_fatalities, TotalAirframeHrs) %>% 
  na.omit() %>% 
  mutate(age = (Date - FirstFlight),  death_rate = (Total_fatalities/Total_occupants)) %>% 
  group_by(C.n.msn) %>% 
  summarise(date = max(Date), first_flight = min(FirstFlight), age = max(age), total_airframe = max(TotalAirframeHrs), total_occupants = max(Total_occupants), total_fatalities = max(Total_fatalities)) %>% 
  mutate(death_rate = (total_fatalities/total_occupants)*100) %>% 
  filter(!is.nan(death_rate))

cor.test(age_death$age, age_death$death_rate)

    Pearson's product-moment correlation

data:  age_death$age and age_death$death_rate
t = -7.0953, df = 2760, p-value = 1.634e-12
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.17028842 -0.09702938
sample estimates:
       cor 
-0.1338417 

۵. چه ویژگی هایی از ایرلاین در رخدادهای امنیتی آن تاثیر دارد؟(نظیر قدمت و …)

در این سوال قصد داریم به بررسی قدمت هواپیمایی، میانگین سن هواپیما های دچار سانحه شده، میزان زمان پرواز، میانگین تعداد خدمه و تعداد حادثه های هر شرکت هواپیمایی و ارتباط آن با میزان نرخ کشنده بودن کلی و میانگین نرخ مرگ در سوانح آن ها بپردازیم.

library(corrplot)

company_attr = read_csv("Data/asn_c.csv") %>% as.data.frame(stringsAsFactors = F) %>% 
  mutate(Total_occupants = ifelse(Total_occupants == 0 & Total_fatalities != 0, Total_fatalities, Total_occupants),
         Total_survivors = abs(Total_occupants - Total_fatalities)) %>% 
  mutate(Total_survivors = ifelse(Total_survivors > Total_occupants, Total_occupants, abs(Total_occupants - Total_fatalities))) %>% 
  mutate(is_army = str_detect(Operator, regex("Force|Navy",ignore_case = T))) %>% 
  mutate(occ_no = row_number()) %>% 
  filter(is_army == FALSE) %>% 
  select(C.n.msn,
         Date,
         Operator,
         FirstFlight,
         Total_occupants,
         Total_fatalities,
         TotalAirframeHrs,
         Crew_occupants) %>% 
  na.omit() %>% 
  mutate(airplane_age = (Date - FirstFlight),
         death_rate = (Total_fatalities/Total_occupants)*100) %>% 
  filter(!is.nan(death_rate)) %>% 
  group_by(Operator) %>% 
  summarise(last_event = max(Date),
            first_event = min(FirstFlight),
            mean_age_plane = mean(airplane_age),
            total_airframe = sum(TotalAirframeHrs),
            mean_airframe = mean(TotalAirframeHrs),
            total_occupants = sum(Total_occupants),
            mean_occupants = mean(Total_occupants),
            total_fatalities = sum(Total_fatalities),
            mean_fatalities = mean(Total_fatalities),
            total_crew = sum(Crew_occupants),
            mean_crew = mean(Crew_occupants),
            death_rate_avg = mean(death_rate),
            event_count = n()) %>% 
  mutate(death_rate_company = (total_fatalities/total_occupants)*100,
         company_age = last_event - first_event) %>% 
  filter(!is.nan(death_rate_company))
# total death rate
cor.test(company_attr$company_age, company_attr$death_rate_company)

    Pearson's product-moment correlation

data:  company_attr$company_age and company_attr$death_rate_company
t = -5.0208, df = 1476, p-value = 5.772e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.17938723 -0.07911754
sample estimates:
       cor 
-0.1295836 
cor.test(company_attr$mean_age_plane, company_attr$death_rate_company)

    Pearson's product-moment correlation

data:  company_attr$mean_age_plane and company_attr$death_rate_company
t = -4.6472, df = 1476, p-value = 3.665e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.17003347 -0.06952257
sample estimates:
       cor 
-0.1200857 

همانطور که مشاهده می شود قدمت شرکت و میانگین سن هواپیما های آن با نرخ مرگ ومیر کلی شرکت رابطه ی عکس دارد که برای قدمت شرکت می توان افزایش تجربه ی آن ها را عامل دانست و برای سن هواپیما نیز استدلالی همچون سوال قبل به کار برد.

cor.test(company_attr$total_airframe, company_attr$death_rate_company)

    Pearson's product-moment correlation

data:  company_attr$total_airframe and company_attr$death_rate_company
t = -1.3747, df = 1476, p-value = 0.1694
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.08659116  0.01525649
sample estimates:
        cor 
-0.03576019 
cor.test(company_attr$mean_crew, company_attr$death_rate_company)

    Pearson's product-moment correlation

data:  company_attr$mean_crew and company_attr$death_rate_company
t = 1.4797, df = 1476, p-value = 0.1392
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.01252644  0.08930061
sample estimates:
     cor 
0.038487 
cor.test(company_attr$event_count, company_attr$death_rate_company)

    Pearson's product-moment correlation

data:  company_attr$event_count and company_attr$death_rate_company
t = 0.059144, df = 1476, p-value = 0.9528
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.04945328  0.05252419
sample estimates:
        cor 
0.001539456 

همانطور که مشاهده می شود میزان پرواز و میانگین تعداد خدمه و تعداد حوادث یک شرکت هواپیمایی ارتباطی با نرخ مرگ ومیر کلی آن ندارد.

# mean death rate per airplane
cor.test(company_attr$company_age, company_attr$death_rate_avg)

    Pearson's product-moment correlation

data:  company_attr$company_age and company_attr$death_rate_avg
t = -4.3333, df = 1476, p-value = 1.568e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.16214405 -0.06144413
sample estimates:
       cor 
-0.1120818 
cor.test(company_attr$mean_age_plane, company_attr$death_rate_avg)

    Pearson's product-moment correlation

data:  company_attr$mean_age_plane and company_attr$death_rate_avg
t = -5.0192, df = 1476, p-value = 5.818e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.17934836 -0.07907763
sample estimates:
       cor 
-0.1295442 

همانطور که مشاهده می شود قدمت شرکت و میانگین سن هواپیما های آن با نرخ مرگ ومیر میانگین هر سانحه ی آن شرکت رابطه ی عکس دارد که برای قدمت شرکت می توان افزایش تجربه ی آن ها را عامل دانست و برای سن هواپیما نیز استدلالی همچون سوال قبل به کار برد.

cor.test(company_attr$total_airframe, company_attr$death_rate_avg)

    Pearson's product-moment correlation

data:  company_attr$total_airframe and company_attr$death_rate_avg
t = -0.56442, df = 1476, p-value = 0.5726
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.06562941  0.03632635
sample estimates:
        cor 
-0.01468971 
cor.test(company_attr$mean_crew, company_attr$death_rate_avg)

    Pearson's product-moment correlation

data:  company_attr$mean_crew and company_attr$death_rate_avg
t = 1.9059, df = 1476, p-value = 0.05686
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.001445471  0.100282549
sample estimates:
       cor 
0.04954704 
cor.test(company_attr$event_count, company_attr$death_rate_avg)

    Pearson's product-moment correlation

data:  company_attr$event_count and company_attr$death_rate_avg
t = 0.56855, df = 1476, p-value = 0.5697
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.03621909  0.06573636
sample estimates:
      cor 
0.0147971 

همانطور که مشاهده می شود میزان پرواز و میانگین تعداد خدمه و تعداد حوادث یک شرکت هواپیمایی ارتباطی با نرخ مرگ ومیر میانگین هر سانحه ی آن شرکت ندارد.

num_com_attr = company_attr %>% 
  select(-Operator)
cor_num_com_attr = cor(num_com_attr)
corrplot(cor_num_com_attr, method = "square", type = "lower", tl.col = "black", tl.srt = 10)

نمودار همبستگی متغیرهای موجود را در بالا مشاهده می کنید.


۶. بدترین خطوط هوایی، بدترین پروازها، بدترین هواپیماها

بدترین خطوط هوایی، بدترین هواپیماها و بدترین فرودگاه ها
ابتدا معیار بد بودن را انتخاب می کنیم، از آنجایی که شرکت هایی که تعداد پایینی پرواز و یا پروازهای کوچکی داشته باشند، در صورت سقوط دارای نرخ پایین زنده ماندن هستند اما در واقع حادثه بزرگی به شمار نمی آیند، تنها خطوط هوایی، هواپیماها و فرودگاه هایی را انتخاب می کنیم که بالای ۵۰۰ نفر مسافر داشته اند. سپس معیار بد بودن را نرخ پایین زنده ماندن در نظر میگیریم.

# worst airline
worst_airline <- casn %>% filter(!is.na(Operator)) %>% 
  filter(is_army == FALSE) %>% 
  group_by(Operator) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants) %>% 
  ungroup() %>% 
  filter(Total_occupants > 500) %>% 
  top_n(20, wt = desc(Survival_rate)) %>% 
  arrange(Survival_rate)

p = ggplot(data = worst_airline, mapping = aes(x = reorder(Operator, Survival_rate), y = Survival_rate, fill = Total_fatalities)) + 
  geom_bar(stat="identity") + scale_fill_gradient(low="brown1", high="brown4") + 
  ggtitle("Worst Airlines with lowest survival rate") + 
  xlab("Airline") + 
  ylab("Survival rate") + guides(color=guide_legend(title="fatality"), fill=guide_legend(title="fatality")) + 
  coord_flip()
p

# worst airplane
worst_airplane <- casn %>% filter(!is.na(Type)) %>% 
  filter(is_army == FALSE) %>% 
  group_by(Type) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants) %>% 
  ungroup() %>% 
  filter(Total_occupants > 500) %>% 
  top_n(20, wt = desc(Survival_rate)) %>% 
  arrange(Survival_rate)

p = ggplot(data = worst_airplane, mapping = aes(x = reorder(Type, Survival_rate), y = Survival_rate, fill = Total_fatalities)) + 
  geom_bar(stat="identity") +
  ggtitle("Worst Airplanes with lowest survival rate") + 
  xlab("Airplane") + 
  ylab("Survival rate") + guides(color=guide_legend(title="fatality"), fill=guide_legend(title="fatality")) + 
  coord_flip()
p

# worst route
worst_departure_airport <- casn %>% filter(!is.na(DepartureAirport)) %>% 
  filter(is_army == FALSE) %>% 
  group_by(DepartureAirport) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants) %>% 
  ungroup() %>% 
  filter(Total_occupants > 500) %>% 
  top_n(20, wt = desc(Survival_rate)) %>% 
  arrange(Survival_rate)

p = ggplot(data = worst_departure_airport, mapping = aes(x = reorder(DepartureAirport, Survival_rate), y = Survival_rate, fill = Total_fatalities)) + 
  geom_bar(stat="identity") + scale_fill_gradient(low="midnightblue", high="darkred") +
  ggtitle("Worst Departure Airports with lowest survival rate") + 
  xlab("Departure Airport") + 
  ylab("Survival rate") + guides(color=guide_legend(title="fatality"), fill=guide_legend(title="fatality")) + 
  coord_flip()
p


۷. سالانه چندین تصادف هوایی رخ می دهد؟ چند نفر سوار پرواز بوده اند؟ چند نفر جان سالم به در برده و چند نفر فوت کرده است؟

بررسی روند تلفات رخدادهای امنیتی پروازها در طول سالیان
برای این منظور ابتدا داده ها را بر اساس سال گروه بندی می کنیم، سپس تعداد تلفات، بازماندگان، افراد درگیر در حادثه و نرخ زنده ماندن را بدست می آوریم.(برای سال ۱۹۲۱ داده ی مناسبی به وجود نداشت به همین علت این سال از داده ها حذف شده است.)

# army and civil flights
year_fat <- casn %>% filter(!is.na(Date)) %>% group_by(Date) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants)

# remove bad data
year_fat <- year_fat[-c(3),]

highchart() %>% 
  hc_add_series(data = year_fat, type = "spline", hcaes(x = Date, y = Total_fatalities), name = "Total Fatalities") %>% 
  hc_add_series(data = year_fat, type = "spline", hcaes(x = Date, y = Total_survivors), name = "Total Survivors") %>% 
  hc_yAxis(title = list(text = "Count")) %>% 
  hc_xAxis(title = list(text = "Year")) %>% 
  hc_title(text = "Fatalities Per Year", style = list(fontWeight = "bold")) %>%
  hc_add_theme(hc_theme_flat())

همانطور که مشاهده می کنیم، تلفات حوادث در حال کاهش است. البته باید دقت داشته باشیم که این کاهش هم چنین نشانگر این است که استاندارد پرواز ها بالاتر رفته است. زیرا هر چه سال جلوتر می روند، تکنولوژی نیز پیشرفت کرده و تعداد مسافران هواپیماها افزایش یافته و استفاده از سفر هوایی بیشتر می شود. پس تعداد مسافرین بیشتر شده و تعداد کشتگان کمتر می شود که نشان دهنده ی بهبود وضعیت است.

year_fat  %>% 
  hchart(type = "spline", hcaes(x = Date, y = Survival_rate), name = "Survival Rate") %>% 
  hc_yAxis(title = list(text = "Survival Rate")) %>% 
  hc_xAxis(title = list(text = "Year")) %>% 
  hc_title(text = "Survival Rate Per Year", style = list(fontWeight = "bold")) %>%
  hc_add_theme(hc_theme_sandsignika())

با توجه به نمودارهای بالا همانطور که انتظار داشتیم، نرخ زنده ماندن تقریبا به صورت خطی بیشتر شده است.


۸. در هر کدام از دسته های علت وقوع مشکلات پروازها، چه هواپیماهایی دچار مشکل شدند و چند کشته برجای گذاشته اند؟

ابتدا دسته بندی حاصل از lda را با داده ی پروازها ادغام می کنیم و موضوعات هر رخداد هوایی را بدست می آوریم. سپس میزان تلفات و نرخ فوت را نمایش می دهیم.

casn <- casn %>% mutate(occ_no = as.character(occ_no))

casn_topics <- casn %>% inner_join(accidents_data, by = c("occ_no" = "document"))

casn_topics_sum <- casn_topics %>% group_by(topic) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors)) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic"))


casn_topics_sum %>% arrange(desc(Fatality_rate)) %>% 
  hchart(type = "column", hcaes(x = words ,y = Fatality_rate, color = Total_occupants)) %>% 
  hc_yAxis(title = list(text = "Fatality Rate")) %>% 
  hc_xAxis(title = list(text = "Topic")) %>% 
  hc_title(text = "Safety Occurence Topics Fatality Rate", style = list(fontWeight = "bold")) %>% 
  hc_add_theme(hc_theme_google())
casn_topics_yearly <- casn_topics %>% group_by(topic, Date) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors)) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic")) %>% 
  mutate(Survival_rate = ifelse(is.na(Survival_rate), 0, Survival_rate),
         Fatality_rate = ifelse(is.na(Fatality_rate), 0, Fatality_rate))

casn_topics_yearly %>% arrange(Date) %>% 
hchart("heatmap", hcaes(x = Date, y = words,value = Fatality_rate)) %>% 
  hc_title(text = "Safety Occurence Topics Fatality Rate in Years", style = list(fontWeight = "bold"))

در این بخش نشان می دهیم که چه حوادثی برای هواپیماها بیشتر رخ می دهد. برای این منظور بر اساس نوع هواپیما و تاپیک گروه بندی کردی و تعداد و مجموع بازماندگان و جانباختگان را بدست می آوریم. سپس هواپیماهایی را انتخاب می کنیم که بیشتر از ۲۵۰ نفر کشته داشته باشند.

casn_topic_air <- casn_topics %>% group_by(Type, topic) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors),
            occ = n()) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic")) %>% 
  mutate(Survival_rate = ifelse(is.na(Survival_rate), 0, Survival_rate),
         Fatality_rate = ifelse(is.na(Fatality_rate), 0, Fatality_rate)) %>% 
  filter(Total_fatalities > 250) %>% 
  ungroup() %>% 
  group_by(Type) %>% 
  arrange(desc(occ)) %>% 
  slice(1) %>% 
  select(Aircraft = Type, Cause = words, occurance = occ, Fatality_rate, Total_occupants, Total_fatalities) %>% 
  arrange(desc(Total_fatalities))

knitr::kable(casn_topic_air)
Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
Boeing 747SR-46 air, force, transport, base, flying 1 99.23664 524 520
Tupolev 154M runway, captain, feet, approach, pilot 5 76.47975 642 491
Tupolev 154B-2 runway, captain, feet, approach, pilot 4 71.53846 650 465
Antonov 24B accident, destroyed, killed, airport, crew 13 81.60470 511 417
McDonnell Douglas DC-10-10 engine, fuel, wing, left, takeoff 4 40.89457 939 384
Airbus A321-231 accident, destroyed, killed, airport, crew 2 100.00000 376 376
Ilyushin 18V accident, destroyed, killed, airport, crew 7 78.82600 477 376
Douglas DC-6 feet, departed, crew, reported, cleared 11 90.83969 393 357
Boeing 747-121 airport, boeing, international, air, otter 2 45.80645 775 355
Tupolev 104B accident, destroyed, killed, airport, crew 7 56.78808 604 343
Tupolev 134A en, route, night, operation, NA 6 85.24173 393 335
Boeing 747-237B airport, boeing, international, air, otter 3 33.23232 990 329
Boeing 747-168B feet, departed, crew, reported, cleared 1 100.00000 312 312
McDonnell Douglas DC-9-32 feet, departed, crew, reported, cleared 6 76.00000 400 304
Lockheed L-1011 TriStar 200 pilot, crew, test, cabin, system 1 100.00000 301 301
Boeing 777-2H6ER feet, departed, crew, reported, cleared 1 100.00000 298 298
Airbus A300B2-203 hijacker, demanded, hijackers, passengers, 1 4 58.46774 496 290
Lisunov Li-2 en, route, night, operation, NA 45 64.94382 445 289
Ilyushin 12 en, route, night, operation, NA 25 66.51270 433 288
Lockheed C-130H Hercules accident, destroyed, killed, airport, crew 9 71.28463 397 283
Douglas DC-6B feet, departed, crew, reported, cleared 9 87.85047 321 282
Ilyushin 76MD hijacker, demanded, hijackers, passengers, 1 2 100.00000 275 275
Boeing 747-230B feet, departed, crew, reported, cleared 1 100.00000 269 269
Airbus A300B4-622R runway, captain, feet, approach, pilot 1 97.41697 271 264
Airbus A300B4-605R runway, captain, feet, approach, pilot 2 61.46572 423 260
Boeing 747-121A pilot, crew, test, cabin, system 1 100.00000 259 259
Ilyushin Il-76TD air, force, transport, base, flying 1 100.00000 257 257
McDonnell Douglas DC-10-30 feet, departed, crew, reported, cleared 3 54.91453 468 257
McDonnell Douglas DC-8-63CF feet, departed, crew, reported, cleared 1 100.00000 256 256
Ilyushin 14P en, route, night, operation, NA 12 84.38538 301 254

در این بخش نشان می دهیم که برای هر تاپیک، چه هواپیماهایی بیشتر درگیر این رخداد می شوند. برای این منظور بر اساس تاپیک و نوع هواپیما گروه بندی کرده و تعداد رخداد و مجموع بازماندگان و جانباختگان را محاسبه می کنیم. در نهایت برای هر تاپیک ۵ هواپیمایی که بیشتر این تاپیک برایشان رخ داده است را نمایش می دهیم.

casn_topic_air <- casn_topics %>% group_by(topic, Type) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors),
            occ = n()) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic")) %>% 
  mutate(Survival_rate = ifelse(is.na(Survival_rate), 0, Survival_rate),
         Fatality_rate = ifelse(is.na(Fatality_rate), 0, Fatality_rate)) %>% 
  ungroup() %>% 
  filter(Survival_rate < 100) %>% 
  group_by(topic) %>% 
  arrange(desc(occ)) %>% 
  slice(1:5) %>% 
  select(Aircraft = Type, Cause = words, occurance = occ, Fatality_rate, Total_occupants, Total_fatalities) %>% 
  arrange(desc(Total_fatalities))

knitr::kable(casn_topic_air %>% filter(topic == 1))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
1 Douglas C-47-DL (DC-3) landing, gear, main, collapsed, undercarriage 14 32.432432 37 12
1 Consolidated PBY-5A Catalina landing, gear, main, collapsed, undercarriage 11 10.000000 40 4
1 Consolidated PBY-5 Catalina landing, gear, main, collapsed, undercarriage 12 5.882353 17 1
1 Douglas Dakota III (DC-3) landing, gear, main, collapsed, undercarriage 12 0.000000 0 0
1 Vickers Valetta C.1 landing, gear, main, collapsed, undercarriage 12 0.000000 0 0
knitr::kable(casn_topic_air %>% filter(topic == 2))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
2 Douglas C-47-DL (DC-3) sea, missing, water, hit, hangar 35 94.48819 127 120
2 Douglas C-47A-90-DL (DC-3) sea, missing, water, hit, hangar 19 100.00000 36 36
2 Douglas Dakota III (DC-3) sea, missing, water, hit, hangar 28 94.59459 37 35
2 Consolidated PBY-5A Catalina sea, missing, water, hit, hangar 37 71.42857 35 25
2 Consolidated PBY-5 Catalina sea, missing, water, hit, hangar 40 23.80952 21 5
knitr::kable(casn_topic_air %>% filter(topic == 3))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
3 Fokker F-27 Friendship 600 runway, feet, short, left, rest 16 29.503916 383 113
3 de Havilland Canada DHC-6 Twin Otter 300 runway, feet, short, left, rest 18 29.646018 226 67
3 Yakovlev 40 runway, feet, short, left, rest 21 14.583333 384 56
3 Avro 685 York C.1 runway, feet, short, left, rest 11 9.677419 93 9
3 Douglas Dakota IV (DC-3) runway, feet, short, left, rest 11 0.000000 0 0
knitr::kable(casn_topic_air %>% filter(topic == 4))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
4 Douglas C-47-DL (DC-3) crash, terrain, lake, airstrip, pilot 24 96.77419 124 120
4 de Havilland Canada DHC-6 Twin Otter 300 crash, terrain, lake, airstrip, pilot 14 55.88235 136 76
4 Douglas C-47A-30-DL (DC-3) crash, terrain, lake, airstrip, pilot 16 92.75362 69 64
4 Cessna 208B Grand Caravan crash, terrain, lake, airstrip, pilot 11 39.47368 76 30
4 Britten-Norman BN-2A-26 Islander crash, terrain, lake, airstrip, pilot 10 56.41026 39 22
knitr::kable(casn_topic_air %>% filter(topic == 5))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
5 Curtiss C-46F-1-CU Commando engine, fuel, wing, left, takeoff 7 27.73109 119 33
5 Douglas C-47-DL (DC-3) engine, fuel, wing, left, takeoff 11 26.89076 119 32
5 Boeing KC-135A Stratotanker engine, fuel, wing, left, takeoff 8 100.00000 18 18
5 Antonov 2R engine, fuel, wing, left, takeoff 6 28.57143 14 4
5 Consolidated PBY-5A Catalina engine, fuel, wing, left, takeoff 11 11.11111 9 1
knitr::kable(casn_topic_air %>% filter(topic == 6))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
6 McDonnell Douglas DC-9-32 pilot, crew, test, cabin, system 6 20.79395 529 110
6 Beechcraft 200 Super King Air pilot, crew, test, cabin, system 8 66.66667 33 22
6 de Havilland Canada DHC-6 Twin Otter 300 pilot, crew, test, cabin, system 7 41.46341 41 17
6 Antonov 2 pilot, crew, test, cabin, system 5 28.12500 32 9
6 Swearingen SA226-TC Metro II pilot, crew, test, cabin, system 8 13.15789 38 5
knitr::kable(casn_topic_air %>% filter(topic == 7))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
7 Boeing 727 hijacker, demanded, hijackers, passengers, 1 120 0.4461440 1569 7
7 Douglas DC-3 hijacker, demanded, hijackers, passengers, 1 27 2.1276596 235 5
7 NA hijacker, demanded, hijackers, passengers, 1 41 0.5540166 361 2
7 McDonnell Douglas DC-9 hijacker, demanded, hijackers, passengers, 1 32 0.3496503 572 2
7 McDonnell Douglas DC-8 hijacker, demanded, hijackers, passengers, 1 36 0.1322751 756 1
knitr::kable(casn_topic_air %>% filter(topic == 8))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
8 Antonov 24RV runway, captain, feet, approach, pilot 6 43.63636 165 72
8 Douglas C-47A-25-DK (DC-3) runway, captain, feet, approach, pilot 7 39.79592 98 39
8 de Havilland Canada DHC-6 Twin Otter 300 runway, captain, feet, approach, pilot 11 41.77215 79 33
8 Cessna 208B Grand Caravan runway, captain, feet, approach, pilot 7 51.35135 37 19
8 Learjet 35A runway, captain, feet, approach, pilot 10 46.87500 32 15
knitr::kable(casn_topic_air %>% filter(topic == 9))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
9 Beechcraft 200 Super King Air damage, airport, sustained, substantial, accident 46 55.10204 196 108
9 Cessna 208B Grand Caravan damage, airport, sustained, substantial, accident 74 14.25703 498 71
9 Beechcraft B200 Super King Air damage, airport, sustained, substantial, accident 25 46.95652 115 54
9 Beechcraft A100 King Air damage, airport, sustained, substantial, accident 19 30.33708 89 27
9 Consolidated C-87 Liberator Express damage, airport, sustained, substantial, accident 21 0.00000 0 0
knitr::kable(casn_topic_air %>% filter(topic == 10))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
10 Antonov 2R crashed, mountain, 2, km, antonov 164 67.17557 262 176
10 Antonov 2 crashed, mountain, 2, km, antonov 83 61.19403 268 164
10 Douglas C-47-DL (DC-3) crashed, mountain, 2, km, antonov 50 72.60274 146 106
10 Curtiss C-46D-10-CU Commando crashed, mountain, 2, km, antonov 62 95.23810 42 40
10 Curtiss C-46D-5-CU Commando crashed, mountain, 2, km, antonov 57 100.00000 6 6
knitr::kable(casn_topic_air %>% filter(topic == 11))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
11 Antonov 24B accident, destroyed, killed, airport, crew 13 81.60470 511 417
11 Antonov 26 accident, destroyed, killed, airport, crew 12 77.77778 144 112
11 Douglas C-47 (DC-3) accident, destroyed, killed, airport, crew 13 64.40678 118 76
11 Douglas C-47-DL (DC-3) accident, destroyed, killed, airport, crew 20 56.70103 97 55
11 Let L-410UVP accident, destroyed, killed, airport, crew 12 22.22222 72 16
knitr::kable(casn_topic_air %>% filter(topic == 12))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
12 de Havilland DH.86 Express fire, caught, ground, destroyed, de 7 100.00000 27 27
12 Consolidated PBY-5A Catalina fire, caught, ground, destroyed, de 13 38.59649 57 22
12 Douglas C-47-DL (DC-3) fire, caught, ground, destroyed, de 15 41.02564 39 16
12 Douglas C-47 (DC-3) fire, caught, ground, destroyed, de 8 45.83333 24 11
12 Consolidated PBY-5 Catalina fire, caught, ground, destroyed, de 7 0.00000 0 0
knitr::kable(casn_topic_air %>% filter(topic == 13))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
13 Douglas DC-6 feet, departed, crew, reported, cleared 11 90.83969 393 357
13 de Havilland Canada DHC-6 Twin Otter 300 feet, departed, crew, reported, cleared 19 86.50000 200 173
13 Douglas C-47-DL (DC-3) feet, departed, crew, reported, cleared 12 80.60606 165 133
13 Douglas C-47A-80-DL (DC-3) feet, departed, crew, reported, cleared 10 82.78689 122 101
13 Cessna 208B Grand Caravan feet, departed, crew, reported, cleared 12 75.00000 64 48
knitr::kable(casn_topic_air %>% filter(topic == 14))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
14 Douglas C-47-DL (DC-3) engine, takeoff, landing, forced, lost 34 43.25397 252 109
14 Douglas Dakota III (DC-3) engine, takeoff, landing, forced, lost 25 82.35294 34 28
14 Vickers Valetta C.1 engine, takeoff, landing, forced, lost 31 26.58228 79 21
14 Antonov 2R engine, takeoff, landing, forced, lost 30 20.00000 60 12
14 Douglas Dakota IV (DC-3) engine, takeoff, landing, forced, lost 37 17.94872 39 7
knitr::kable(casn_topic_air %>% filter(topic == 15))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
15 Douglas C-47-DL (DC-3) damaged, repair, accident, reportedly, raf 68 98.33333 60 59
15 Douglas Dakota III (DC-3) damaged, repair, accident, reportedly, raf 45 64.55696 79 51
15 Douglas Dakota IV (DC-3) damaged, repair, accident, reportedly, raf 36 77.77778 9 7
15 Douglas C-47D (DC-3) damaged, repair, accident, reportedly, raf 40 0.00000 0 0
15 Douglas C-47A-90-DL (DC-3) damaged, repair, accident, reportedly, raf 34 0.00000 0 0
knitr::kable(casn_topic_air %>% filter(topic == 16))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
16 de Havilland Canada DHC-6 Twin Otter 300 airport, boeing, international, air, otter 45 25.15337 489 123
16 de Havilland Canada DHC-6 Twin Otter 100 airport, boeing, international, air, otter 6 60.93750 64 39
16 de Havilland Canada DHC-6 Twin Otter 200 airport, boeing, international, air, otter 13 38.70968 93 36
16 Learjet 35A airport, boeing, international, air, otter 9 50.00000 42 21
16 Beechcraft B200 Super King Air airport, boeing, international, air, otter 5 53.84615 26 14
knitr::kable(casn_topic_air %>% filter(topic == 17))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
17 Douglas C-47-DL (DC-3) dc, 3, struck, 4, douglas 26 59.53079 341 203
17 Douglas C-47A-90-DL (DC-3) dc, 3, struck, 4, douglas 14 82.91139 158 131
17 Douglas C-47 (DC-3) dc, 3, struck, 4, douglas 13 80.34188 117 94
17 Douglas DC-3 dc, 3, struck, 4, douglas 19 61.02941 136 83
17 Douglas C-47A-25-DK (DC-3) dc, 3, struck, 4, douglas 12 92.00000 50 46
knitr::kable(casn_topic_air %>% filter(topic == 18))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
18 Lisunov Li-2 en, route, night, operation, NA 45 64.94382 445 289
18 Douglas C-47-DL (DC-3) en, route, night, operation, NA 73 70.58824 119 84
18 Douglas C-47A-80-DL (DC-3) en, route, night, operation, NA 45 58.18182 110 64
18 Douglas Dakota III (DC-3) en, route, night, operation, NA 35 45.94595 111 51
18 Douglas C-47A-90-DL (DC-3) en, route, night, operation, NA 26 63.26531 49 31
knitr::kable(casn_topic_air %>% filter(topic == 19))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
19 Junkers Ju-52/3m air, force, transport, base, flying 71 69.38776 147 102
19 Consolidated PBY-5 Catalina air, force, transport, base, flying 56 72.82609 92 67
19 Consolidated PBY-5A Catalina air, force, transport, base, flying 52 50.75758 132 67
19 Douglas C-47 (DC-3) air, force, transport, base, flying 22 90.41096 73 66
19 Douglas C-47-DL (DC-3) air, force, transport, base, flying 29 44.00000 125 55
knitr::kable(casn_topic_air %>% filter(topic == 20))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
20 Douglas Dakota IV (DC-3) approach, weather, pilot, conditions, visibility 22 99.01478 203 201
20 de Havilland Canada DHC-6 Twin Otter 300 approach, weather, pilot, conditions, visibility 18 63.39286 224 142
20 Douglas Dakota III (DC-3) approach, weather, pilot, conditions, visibility 18 90.10989 91 82
20 Antonov 2 approach, weather, pilot, conditions, visibility 14 60.78431 51 31
20 Antonov 2R approach, weather, pilot, conditions, visibility 13 33.33333 12 4

۹. آیا وقوع رخدادهایی برای یک ایرلاین باعث بهبود روند آن می شود؟

برای بررسی این روند نمودار تعداد حوادث شرکت های هواپیمایی ای که داده های مناسبی را داشتند را مورد بررسی کردیم، شرکت ّهای منتخب ما دارای تلفات قابل توجه و تعداد سال های حادثه خیز بیشتر از ۲ است. همانطور که در نمودار قابل رویت است بیشتر این شرکت ها بعد از رسیدن به یک نقطه ی اوج از سوانح هوایی آن را رفته رفته تعدیل کردند و به وضوح روند خود را بهبود بخشیده اند پس می توان این گزاره را صحیح دانست.

library(highcharter)
library(ggplot2)

company_year = read_csv("Data/asn_c.csv") %>% as.data.frame(stringsAsFactors = F) %>% 
  mutate(Total_occupants = ifelse(Total_occupants == 0 & Total_fatalities != 0, Total_fatalities, Total_occupants),
         Total_survivors = abs(Total_occupants - Total_fatalities)) %>% 
  mutate(is_army = str_detect(Operator, regex("Force|Navy",ignore_case = T))) %>% 
  filter(is_army == FALSE) %>% 
  select(C.n.msn,
         year = Date,
         Operator,
         FirstFlight,
         Total_occupants,
         Total_fatalities,
         TotalAirframeHrs,
         Crew_occupants) %>% 
  na.omit() %>% 
  group_by(Operator, year) %>% 
  summarise(count = n(), tot_fatal = sum(Total_fatalities), tot_occu = sum(Total_occupants)) %>% 
  filter(tot_occu > 20, tot_fatal > 5) %>% 
  mutate(index = 1, index = cumsum(index), index = max(index)) %>% 
  filter(index > 2) %>% 
  select(-index) %>% 
  mutate(death_rate = tot_fatal*100/tot_occu)

company_year %>% 
  hchart(type = "line",
         hcaes(x = year, y = count, group = Operator)) %>%
  hc_xAxis(title = list(text = "year")) %>%
  hc_yAxis(title = list(text = "Accidents Count"),
           max = 22,
           tickInterval = 1,
           min = 0,
           plotLines = list(list(color = "#FF0000",
                                 width = 2,
                                 value = 11,
                                 dashStyle = 'shortdash'))) %>% 
  hc_title(text = "Accidents Count of an Airline in years",
           style = list(fontWeight = "bold"))

۱۰. انتخاب ارزان ترین پروازها، به معنی ناامن بودن آنها است؟

برای حل این سوال، از داده ی هزینه ی پروازهای آمریکا استفاده می کنیم و میانگین هزینه ی هر خط هوایی را بدست می آوریم. سپس برای هر خط هوایی میزان امنیت را نیز با فرمولی که در سوال بعد به تفصیل توضیح داده خواهد شد، محاسبه می کنیم. در نهایت رابطه ی میان امنیت پروازها و قیمت آن ها را برای خطوط هوایی مهم آمریکا نمایش می دهیم.

airfare <- read_csv("Data/Consumer_Airfare_Report__Table_5_-_Detailed_Fare_Information_For_Highest_and_Lowest_Fare_Markets_Under_750_Miles.csv")

american_airlines <- airfare %>% select(car) %>% distinct(car) %>% arrange(car)
pattern_name = c("Tropic", "Air Plus Comet", "American Airlines", "Alaska Airlines", "JetBlue", "Continental Air Lines",
                 "Discovery Airways", "Delta Air Lines", "Frontier Airlines (FL)", "Tran Airways", "Allegiant Air",
                 "America West Airlines", "", "", "Kiwi Regional Airlines", "", "Spirit Airlines", "Northwest",
                 "", "", "Horizon Air", "", "", "", "Skydive Twin Cities", "", "United Airlines", "US Airways",
                 "American Virginia", "Pacific Western Airlines", "", "Southwest Airlines", "", "", "Mesa Airlines",
                 "Midwest", "Air Wisconsin")

american_airlines <- data.frame(american_airlines, pattern_name)
colnames(american_airlines ) <- c("car","Operator")

airfare <- na.omit(airfare)
airfare <- airfare %>% mutate(price = as.numeric(str_extract(mkt_fare,"\\d+")))
sum_airfare <- airfare %>% group_by(car, Year) %>% summarise(price = mean(price))
sum_airfare <- sum_airfare %>% inner_join(american_airlines, by = c("car")) %>% ungroup() %>% 
  group_by(Operator) %>% summarise(price = mean(price)) %>% 
  arrange(desc(Operator))

need = as.data.frame(c(2, 3, 0, 0, 0, 4, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 5, 3, 1, 6, 6, 0, 7, 0, 0, 0))
colnames(need) = c('need')

sum_airfare <- sum_airfare %>% bind_cols(need) %>% group_by(need) %>% summarise(price = mean(price)) %>% 
  ungroup() %>% filter(need != 0)

airlines_name = as.data.frame(c("Virgin Atlantic",
                  "Southwest Airlines",
                  "United / Continental*",
                  "Alaska Airlines*",
                  "American*",
                  "Delta / Northwest*",
                  "US Airways / America West*"))
colnames(airlines_name) =  c("airline")

need = as.data.frame(c(1,2,3,4,5,6,7))
colnames(need) = c('need')

airline_safety_total = read_csv("Data/airline_safety.csv") %>% 
  mutate(total_fatal_accidents = (fatal_accidents_85_99 + fatal_accidents_00_14),
         total_incidents = (incidents_85_99 + incidents_00_14)) %>% 
  select(airline, avail_seat_km_per_week, total_fatal_accidents, total_incidents) %>% 
  mutate(score = 20 - (9*total_fatal_accidents + total_incidents)*10^8/avail_seat_km_per_week)

airline_safety_total = merge(airline_safety_total, airlines_name, sort = FALSE) %>% 
  arrange(desc(airline)) %>% 
  bind_cols(need)

safety_price <- airline_safety_total %>% inner_join(sum_airfare, by = c("need"))

knitr::kable(safety_price %>% select(airline, price, safety = score))
airline price safety
Virgin Atlantic 97.66667 19.90052
US Airways / America West* 244.64957 15.60205
United / Continental* 227.91987 18.27714
Southwest Airlines 134.86595 19.72532
Delta / Northwest* 249.27236 17.33360
American* 159.38471 17.89609
Alaska Airlines* 94.39572 18.03180
cor.test(formula = ~price + score, data = safety_price)

    Pearson's product-moment correlation

data:  price and score
t = -2.2164, df = 5, p-value = 0.07748
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.9522264  0.1044551
sample estimates:
       cor 
-0.7039785 
safety_price_avg = mean(safety_price$price)
safety_price_high <- safety_price %>% filter(price > safety_price_avg)
safety_price_low <- safety_price %>% filter(price <= safety_price_avg)
wilcox.test(safety_price_high$score, safety_price_low$score, alternative = "less", exact = FALSE, correct = FALSE)

    Wilcoxon rank sum test

data:  safety_price_high$score and safety_price_low$score
W = 2, p-value = 0.07865
alternative hypothesis: true location shift is less than 0

همانطور که می بینیم، در تست کوریلیشن فرض استقلال قیمت پرواز و امنیت آن با دقت ۰.۱ رد می شود. هم چنین می بینیم که هزینه ی پرواز و امنیت آن همبستگی معکوس بالایی دارند و هر چه پرواز ارزانتر می شود، امنیت آن کمتر می شود. همچنین برای اطمینان ببیشتر نیز از تست wilcox rank sum test استفاده می کنیم. فرض صفر را بیشتر بودن امنیت پروازهای ارزانتر در نظر میگیریم. مشاهده می کنیم که با دقت ۰.۱ فرض صفر باطل است و پروازهای ارزانتر امنیت بیشتری از پروازهای گرانتر ندارند.


۱۱. اضافه کردن معیار امنیت به پروازها

برای این سوال به دنبال یک معیار جهت ارزیابی همه جانبه ی یک پرواز با توجه به سابقه ی هواپیمایی و سابقه ی هواپیما و همچنین یک پرواز خاص که خود شرایط خاصی همچون مسیر و مسافت دارد بودیم و فرمول زیر را به عنوان معیاری از امنیت برای یک پرواز بدست آورده ایم.
معیار امنیت = نرخ نجات در شرکت هواپیمایی در میانگین تعداد مسافران در هر پرواز + نرخ نجات در هواپیمای موردنظر در میانگین تعداد مسافران آن هواپیما + حاصل ضرب نرخ نجات در هواپیما و شرکت هواپیمایی در تعداد مسافر هواپیمای مذکور
بعد از اعمال این معیار نتیجه کسب بالاترین امتیاز توسط هواپیما های برتر در خطوط هوایی عالی در جهان شد که مناسب بودن معیار را نشان می دهد. نمونه ای از این امتیازدهی را در زیر مشاهده می کنید.

# find safety for flights
casn_flight_safety = casn_topics %>% filter(!is.na(Type) & !is.na(Operator) & Total_occupants != 0) %>% 
  group_by(Type) %>% 
  mutate(airplane_total_occupants = sum(Total_occupants), airplane_total_fatalities = sum(Total_fatalities),
         airplane_total_survivors = sum(Total_survivors), airplane_survival_rate = airplane_total_survivors/airplane_total_occupants,
         airplane_occurance = n(), airplane_mean_occupants = mean(Total_occupants)) %>% 
  ungroup() %>% 
  group_by(Operator) %>% 
  mutate(airline_total_occupants = sum(Total_occupants), airline_total_fatalities = sum(Total_fatalities),
         airline_total_survivors = sum(Total_survivors), airline_survival_rate = airline_total_survivors/airline_total_occupants,
         airline_occurance = n(), airline_mean_occupants = mean(Total_occupants)) %>% 
  ungroup() %>% 
  mutate(score = (airplane_survival_rate*airplane_mean_occupants) + (airline_survival_rate*airline_mean_occupants) + (airplane_survival_rate*airline_survival_rate*Total_occupants)) %>% 
  group_by(Operator, Type) %>% 
  summarise(flight_safety_indicator = mean(score))

knitr::kable(head(casn_flight_safety))
Operator Type flight_safety_indicator
2nd Arkhangelsk United Aviation Division Antonov 2 17.69351
2nd Sverdlovsk Aviation Enterprise Antonov 2R 21.62431
748 Air Services Antonov 12BP 19.07187
748 Air Services British Aerospace BAe-748-398 Srs. 2B 95.75000
748 Air Services Hawker Siddeley HS-748-206 Andover CC2 17.75000
748 Air Services Hawker Siddeley HS-780 Andover C.1 19.75000

۱۲. رده بندی پروازها بر اساس امنیت

از داده های سوال قبل بهره برده و در جدولی برترین پرواز های خطوط هواپیمایی را نشان می دهیم.

top_20_flight = casn_flight_safety %>% 
  ungroup() %>% 
  arrange(-flight_safety_indicator) %>% 
  slice(1:20)
knitr::kable(top_20_flight)
Operator Type flight_safety_indicator
Air Atlanta Icelandic Boeing 747-267B 1338.0000
Mahan Air Boeing 747-3B3 1089.0000
Qantas Airbus A380-842 1062.0357
All Nippon Airways - ANA Boeing 747-481D 1055.8114
Tower Air Boeing 747-136 966.0000
Air France Airbus A380-861 951.4491
Qantas Boeing 747-438 900.3825
Emirates Boeing 777-31H 887.5000
Air Transat Airbus A330-243 870.5000
Emirates Airbus A340-541 837.5000
Thomson Airways Boeing 767-324ER (WL) 810.0000
All Nippon Airways - ANA Boeing 747SR-81 776.8594
Japan Air Lines - JAL McDonnell Douglas DC-10-40 770.1272
Korean Air Boeing 747-4B5 769.7858
United Arab Emirates - Amiri Flight Airbus A300C4-620 755.7454
WestJet Boeing 767-338ER (WL) 747.0000
Ansett Australia Airlines Boeing 747-312 737.5000
Air France Boeing 747-128 730.7760
British Airways Boeing 747-436 720.3829
Qantas Airbus A330-303 716.5805

۱۳. تاثیر وقوع سانحه ی هوایی بر روی قیمت بلیت های آن ایرلاین

برای این سوال ابتدا داده ی قیمت پروازهای آمریکا را با داده ی حوادث ادغام کرده ایم. برای این منظور میانگین قیمت بلیت را به طور سالانه و برای هر ایرلاین بدست آورده و آن را برای رخدادهای امنیتی پروازهای آن ایرلاین در آن سال قرار می دهیم.

همانطور که مشاهده می کنید برای ۳ ایرلاین حادثه خیز در آمریکا اطلاعات را جمع آوری کرده و میزان قیمت و حوادث آن ها را کشیده ایم که به وضوح مشخص است هیچ ربط خاصی بین این دو مقدار وجود ندارد که منطقی نیز به نظر می رسد و هرشرکت هواپیمایی با توجه به هزینه های خود قیمت گذاری را انجام می دهد و در صورت بروز سانحه نیز دلیلی بر کاهش هزینه های آن شرکت دیده نمی شود بلکه خسارات و پیشگیری ها هزینه های بیشتری را متحمل این شرکت ها می کند که تست ارتباط نیز هیچ ارتباطی را بین این دو متغیر تایید نمی کند.

library(reshape2)
sum_airfare <- airfare %>% group_by(car, Year) %>% summarise(price = mean(price))
sum_airfare <- sum_airfare %>% inner_join(american_airlines, by = c("car")) %>% ungroup()

american_crashes <- casn_topics %>% ungroup() %>% 
  inner_join(american_airlines, by = c("Operator"))

sum_american_crashes <- american_crashes %>% group_by(Operator, Date) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors),
            occurance = n()) %>% 
  mutate(Survival_rate = ifelse(Total_occupants < 1, 0, 100*Total_survivors/Total_occupants), 
         Fatality_rate = ifelse(Total_occupants < 1, 0, 100*Total_fatalities/Total_occupants)) %>% 
  ungroup()

Total_occupants_avg <- mean(sum_american_crashes$Total_occupants)
sum_american_crashes <- sum_american_crashes %>% 
  mutate(safety = ifelse(Total_occupants < 1, 0.1, Survival_rate - Total_occupants_avg/(occurance*Total_occupants)))

crash_price <- sum_airfare %>% inner_join(sum_american_crashes, by = c("Operator", "Year" = "Date"))

crash_efect = crash_price %>% 
  select(year = Year, airline = Operator, price, occurance) %>% 
  filter(airline == "American Airlines" |
           airline == "United Airlines" |
           airline == "Delta Air Lines")

ggplot(crash_efect, aes(x = year, y = occurance , group = airline, fill = airline)) +
  geom_bar(stat="identity",position="dodge") + ggtitle("Airline Safety Occurance Yearly")

ggplot(crash_efect, aes(x = year, y = price , group = airline, fill = airline)) +
  geom_bar(stat="identity",position="dodge") + ggtitle("Airline Average Price Yearly")

cor.test(crash_efect$price, crash_efect$occurance)

    Pearson's product-moment correlation

data:  crash_efect$price and crash_efect$occurance
t = 0.23624, df = 32, p-value = 0.8148
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.3006828  0.3746055
sample estimates:
       cor 
0.04172557 

۱۴. تاثیر تحریم ها بر روی سوانح هوایی ایران و تحلیل سوالات فوق برای ایران

برای حل این سوال، خطوط هوایی ایران را یافته و داده ی مربوط به ایران را بدست می آوریم. سپس رخدادهای امنیتی را به دو دسته ی با تلفات و بدون آن تقسیم می کنیم و نمودار آن را در طول زمان می کشیم.

iran_casn <- casn_topics %>% filter(str_detect(Operator, "Iran")) %>% 
  bind_rows(casn_topics %>% filter(str_detect(Operator, "Qeshm")))

rest_Iranian_operators = as.data.frame(c("Mahan Air", "Zagros Air", "Kish Air", "Taban Air", "Caspian Airlines",
                           "Saha Air"))
colnames(rest_Iranian_operators) = c("Operator")

iran_casn <- iran_casn %>% 
  bind_rows(casn %>% inner_join(rest_Iranian_operators, by = c("Operator")))

iran_casn <- iran_casn %>% 
  mutate(kind = ifelse(Total_fatalities > 0, "Crash", "Incident"))

# a: sanctions effect
iran_sum <- iran_casn %>% group_by(Date, kind) %>% 
  summarise(count = n(), fatalities = sum(Total_fatalities))

iran_sum <- iran_casn %>% ungroup() %>% group_by(Date) %>% 
  summarise(count = n(), fatalities = sum(Total_fatalities)) %>% 
  mutate(kind = "All") %>% bind_rows(iran_sum)

iran_sum  %>% 
  hchart(type = "column", hcaes(x = Date, y = count, group = kind)) %>% 
  hc_yAxis(title = list(text = "Count")) %>% 
  hc_xAxis(title = list(text = "Year"), 
           plotLines = list(list(color = "#FF0000", width = 2, value = 1978, dashStyle = 'shortdash', 
                                 label = list(text = "Iran Revolution - Sanctions")))) %>% 
  hc_title(text = "Iran Safety Occurance", style = list(fontWeight = "bold")) %>% 
  hc_add_theme(hc_theme_flat())
iran_before <- iran_sum %>% filter(Date < 1978 & kind == "All")
iran_after <- iran_sum %>% filter(Date >= 1978 & kind == "All")
t.test(iran_before$count, iran_after$count, alternative = "less")

    Welch Two Sample t-test

data:  iran_before$count and iran_after$count
t = -2.2802, df = 47.773, p-value = 0.01355
alternative hypothesis: true difference in means is less than 0
95 percent confidence interval:
       -Inf -0.1645028
sample estimates:
mean of x mean of y 
 1.600000  2.222222 

همانطور که می بینیم، همه ی انواع رخدادهای امنیتی پس از انقلاب افزایش یافته است و تحریم ها موثر بوده است.

برای تست تاثیر تحریم ها، فرض صفر را کمتر شدن رخدادهای امنیتی پس از انقلاب در نظر می گیریم. همانطور که نتیجه نشان می دهد، این احتمال بسیار کم بوده و فرض صفر رد می شود. پس تحریم ها تاثیر داشته و رخدادهای امنیتی پس از انقلاب بسیار بیشتر از پیش از آن است.

بدترین خطوط هوایی، پروازها، فرودگاه های مقصد

# 6
# worst airline
worst_airline <- iran_casn %>% filter(!is.na(Operator)) %>% 
  filter(is_army == FALSE) %>% 
  group_by(Operator) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants) %>% 
  ungroup() %>% 
  filter(Survival_rate != 0) %>% 
  top_n(5, wt = desc(Survival_rate)) %>% 
  arrange(Survival_rate)

p = ggplot(data = worst_airline, mapping = aes(x = reorder(Operator, Survival_rate), y = Survival_rate, fill = Total_fatalities)) + 
  geom_bar(stat="identity") + scale_fill_gradient(low="brown1", high="brown4") + 
  ggtitle("Worst Iranian Airlines with lowest survival rate") + 
  xlab("Airline") + 
  ylab("Survival rate") + guides(color=guide_legend(title="fatality"), fill=guide_legend(title="fatality")) + 
  coord_flip()
p

# worst airplane
worst_airplane <- iran_casn %>% filter(!is.na(Type)) %>% 
  filter(is_army == FALSE) %>% 
  group_by(Type) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants) %>% 
  ungroup() %>% 
  filter(Survival_rate != 0) %>% 
  top_n(5, wt = desc(Survival_rate)) %>% 
  arrange(Survival_rate)

p = ggplot(data = worst_airplane, mapping = aes(x = reorder(Type, Survival_rate), y = Survival_rate, fill = Total_fatalities)) + 
  geom_bar(stat="identity") +
  ggtitle("Worst Iranian Airplanes with lowest survival rate") + 
  xlab("Airplane") + 
  ylab("Survival rate") + guides(color=guide_legend(title="fatality"), fill=guide_legend(title="fatality")) + 
  coord_flip()
p

# worst route
worst_departure_airport <- iran_casn %>% filter(!is.na(DepartureAirport)) %>% 
  filter(is_army == FALSE) %>% 
  group_by(DepartureAirport) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants) %>% 
  ungroup() %>% 
  filter(Survival_rate != 0) %>% 
  top_n(5, wt = desc(Survival_rate)) %>% 
  arrange(Survival_rate)

p = ggplot(data = worst_departure_airport, mapping = aes(x = reorder(DepartureAirport, Survival_rate), y = Survival_rate, fill = Total_fatalities)) + 
  geom_bar(stat="identity") + scale_fill_gradient(low="midnightblue", high="darkred") +
  ggtitle("Worst Iranian Departure Airports with lowest survival rate") + 
  xlab("Departure Airport") + 
  ylab("Survival rate") + guides(color=guide_legend(title="fatality"), fill=guide_legend(title="fatality")) + 
  coord_flip()
p

میزان تلفات هوایی ایران در سال ها

# 7
# army and civil flights
year_fat <- iran_casn %>% filter(!is.na(Date)) %>% group_by(Date) %>% 
  summarise(Total_occupants = sum(Total_occupants), Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors), Survival_rate = 100*Total_survivors/Total_occupants)

highchart() %>% 
  hc_add_series(data = year_fat, type = "spline", hcaes(x = Date, y = Total_fatalities), name = "Total Fatalities") %>% 
  hc_add_series(data = year_fat, type = "spline", hcaes(x = Date, y = Total_survivors), name = "Total Survivors") %>% 
  hc_yAxis(title = list(text = "Count")) %>% 
  hc_xAxis(title = list(text = "Year")) %>% 
  hc_title(text = "Iran Fatalities Per Year", style = list(fontWeight = "bold")) %>%
  hc_add_theme(hc_theme_flat())

میزان تلفات علل وقوع حوادث امنیتی در ایران

# 8
iran_casn_topics_sum <- iran_casn %>% group_by(topic) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors)) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic"))


iran_casn_topics_sum %>% arrange(desc(Fatality_rate)) %>% 
  hchart(type = "column", hcaes(x = words ,y = Fatality_rate, color = Total_occupants)) %>% 
  hc_yAxis(title = list(text = "Fatality Rate")) %>% 
  hc_xAxis(title = list(text = "Topic")) %>% 
  hc_title(text = "Iran Safety Occurence Topics Fatality Rate", style = list(fontWeight = "bold")) %>% 
  hc_add_theme(hc_theme_google())
iran_casn_topics_yearly <- iran_casn %>% group_by(topic, Date) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors)) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic")) %>% 
  mutate(Survival_rate = ifelse(is.na(Survival_rate), 0, Survival_rate),
         Fatality_rate = ifelse(is.na(Fatality_rate), 0, Fatality_rate))

iran_casn_topics_yearly %>% arrange(Date) %>% 
  hchart("heatmap", hcaes(x = Date, y = words,value = Fatality_rate)) %>% 
  hc_title(text = "Iran Safety Occurence Topics Fatality Rate in Years", style = list(fontWeight = "bold"))

حوادثی که برای هواپیماهای ایران بیشتر رخ می دهند

iran_casn_topic_air <- iran_casn %>% group_by(Type, topic) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors),
            occ = n()) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic")) %>% 
  mutate(Survival_rate = ifelse(is.na(Survival_rate), 0, Survival_rate),
         Fatality_rate = ifelse(is.na(Fatality_rate), 0, Fatality_rate)) %>% 
  ungroup() %>% 
  group_by(Type) %>% 
  arrange(desc(occ)) %>% 
  slice(1) %>% 
  select(Aircraft = Type, Cause = words, occurance = occ, Fatality_rate, Total_occupants, Total_fatalities) %>% 
  arrange(desc(Total_fatalities))

knitr::kable(iran_casn_topic_air)
Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
Airbus A300B2-203 hijacker, demanded, hijackers, passengers, 1 4 58.467742 496 290
Tupolev Tu-154M en, route, night, operation, NA 1 100.000000 131 131
Boeing 727-286 hijacker, demanded, hijackers, passengers, 1 1 74.285714 105 78
ATR 72-212 hijacker, demanded, hijackers, passengers, 1 1 100.000000 66 66
Fokker F-28 Fellowship 1000 dc, 3, struck, 4, douglas 1 100.000000 66 66
Antonov 74T-200 runway, feet, short, left, rest 1 97.368421 38 37
Lockheed C-130H Hercules sea, missing, water, hit, hangar 1 100.000000 32 32
Yakovlev 40 hijacker, demanded, hijackers, passengers, 1 1 100.000000 30 30
Tupolev 154M landing, gear, main, collapsed, undercarriage 1 18.918919 148 28
Douglas C-47A-35-DL (DC-3) accident, destroyed, killed, airport, crew 1 96.000000 25 24
Boeing 747-131F feet, departed, crew, reported, cleared 1 100.000000 17 17
Lockheed L-1329-25 JetStar II pilot, crew, test, cabin, system 1 100.000000 12 12
Douglas C-47A-30-DL (DC-3) hijacker, demanded, hijackers, passengers, 1 1 100.000000 9 9
Douglas C-47A-25-DK (DC-3) crashed, mountain, 2, km, antonov 2 100.000000 8 8
Ilyushin 76MD Simorgh engine, takeoff, landing, forced, lost 1 100.000000 7 7
Boeing 727 hijacker, demanded, hijackers, passengers, 1 6 1.582278 316 5
Fokker F-27 Friendship 200 crashed, mountain, 2, km, antonov 1 100.000000 4 4
Boeing 737-286 hijacker, demanded, hijackers, passengers, 1 1 100.000000 3 3
de Havilland Canada DHC-6 Twin Otter 300 approach, weather, pilot, conditions, visibility 1 100.000000 3 3
Douglas C-47 (DC-3) crashed, mountain, 2, km, antonov 1 100.000000 1 1
unknown jetliner hijacker, demanded, hijackers, passengers, 1 1 100.000000 1 1
Airbus A300B2-202 hijacker, demanded, hijackers, passengers, 1 1 0.000000 315 0
Boeing 707-300 hijacker, demanded, hijackers, passengers, 1 1 0.000000 0 0
Boeing 727-86 pilot, crew, test, cabin, system 1 0.000000 2 0
Boeing 747 hijacker, demanded, hijackers, passengers, 1 2 0.000000 0 0
Boeing 747SP-86 landing, gear, main, collapsed, undercarriage 1 0.000000 180 0
Dassault Falcon 20E crashed, mountain, 2, km, antonov 1 0.000000 0 0
Dassault Falcon 20F damaged, repair, accident, reportedly, raf 2 0.000000 0 0
Douglas C-47-DL (DC-3) engine, fuel, wing, left, takeoff 1 0.000000 2 0
Douglas C-47A-20-DK (DC-3) engine, takeoff, landing, forced, lost 1 0.000000 6 0
Douglas C-47A-20-DL (DC-3) engine, takeoff, landing, forced, lost 1 0.000000 0 0
Douglas C-47A-70-DL (DC-3) engine, takeoff, landing, forced, lost 1 0.000000 28 0
Douglas C-47B-20-DK (DC-3) fire, caught, ground, destroyed, de 1 0.000000 0 0
Douglas DC-4 engine, takeoff, landing, forced, lost 1 0.000000 3 0
Douglas R4D-4 (DC-3) crashed, mountain, 2, km, antonov 1 0.000000 0 0
Fokker 100 hijacker, demanded, hijackers, passengers, 1 4 0.000000 226 0
Fokker F-27 Friendship 300 hijacker, demanded, hijackers, passengers, 1 1 0.000000 0 0
Fokker F-27 Friendship 400M crashed, mountain, 2, km, antonov 1 0.000000 0 0
Fokker F-27 Friendship 600 runway, feet, short, left, rest 1 0.000000 50 0
Fokker F-28 Fellowship 4000 approach, weather, pilot, conditions, visibility 1 0.000000 88 0
Ilyushin 76MD engine, fuel, wing, left, takeoff 1 0.000000 0 0
Lockheed C-130 Hercules air, force, transport, base, flying 2 0.000000 0 0
Lockheed C-130E Hercules crashed, mountain, 2, km, antonov 2 0.000000 0 0
Lockheed EC-130E Hercules air, force, transport, base, flying 1 0.000000 0 0
Lockheed P-3F Orion en, route, night, operation, NA 1 0.000000 0 0
Vickers 782D Viscount landing, gear, main, collapsed, undercarriage 1 0.000000 0 0
NA hijacker, demanded, hijackers, passengers, 1 5 0.000000 0 0

چه هواپیماهایی در ایران بیشتر دچار هر دسته ی حوادث می شوند

iran_casn_topic_air <- iran_casn %>% group_by(topic, Type) %>% 
  summarise(Total_occupants = sum(Total_occupants), 
            Total_fatalities = sum(Total_fatalities),
            Total_survivors = sum(Total_survivors),
            occ = n()) %>% 
  mutate(Survival_rate = 100*Total_survivors/Total_occupants, Fatality_rate = 100*Total_fatalities/Total_occupants) %>% 
  inner_join(top_terms_merge, by = c("topic")) %>% 
  mutate(Survival_rate = ifelse(is.na(Survival_rate), 0, Survival_rate),
         Fatality_rate = ifelse(is.na(Fatality_rate), 0, Fatality_rate)) %>% 
  ungroup() %>% 
  filter(Survival_rate < 100) %>% 
  group_by(topic) %>% 
  arrange(desc(occ)) %>% 
  slice(1:3) %>% 
  select(Aircraft = Type, Cause = words, occurance = occ, Fatality_rate, Total_occupants, Total_fatalities) %>% 
  arrange(desc(Total_fatalities))

knitr::kable(iran_casn_topic_air %>% filter(topic == 1))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
1 Tupolev 154M landing, gear, main, collapsed, undercarriage 1 18.91892 148 28
1 Fokker 100 landing, gear, main, collapsed, undercarriage 2 0.00000 0 0
1 Vickers 782D Viscount landing, gear, main, collapsed, undercarriage 1 0.00000 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 2))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
2 Lockheed C-130H Hercules sea, missing, water, hit, hangar 1 100 32 32
knitr::kable(iran_casn_topic_air %>% filter(topic == 3))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
3 Antonov 74T-200 runway, feet, short, left, rest 1 97.36842 38 37
knitr::kable(iran_casn_topic_air %>% filter(topic == 4))

topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities —— ——— —— ———- ————– —————- —————–

knitr::kable(iran_casn_topic_air %>% filter(topic == 5))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
5 Ilyushin 76MD engine, fuel, wing, left, takeoff 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 6))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
6 Lockheed L-1329-25 JetStar II pilot, crew, test, cabin, system 1 100 12 12
6 Antonov 74T-200 pilot, crew, test, cabin, system 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 7))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
7 Airbus A300B2-203 hijacker, demanded, hijackers, passengers, 1 4 58.467742 496 290
7 Boeing 727 hijacker, demanded, hijackers, passengers, 1 6 1.582278 316 5
7 NA hijacker, demanded, hijackers, passengers, 1 5 0.000000 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 8))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
8 Tupolev 154M runway, captain, feet, approach, pilot 1 100 119 119
knitr::kable(iran_casn_topic_air %>% filter(topic == 9))

topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities —— ——— —— ———- ————– —————- —————–

knitr::kable(iran_casn_topic_air %>% filter(topic == 10))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
10 Douglas C-47A-25-DK (DC-3) crashed, mountain, 2, km, antonov 2 100 8 8
10 Lockheed C-130E Hercules crashed, mountain, 2, km, antonov 2 0 0 0
10 Dassault Falcon 20E crashed, mountain, 2, km, antonov 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 11))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
11 Boeing 727-86 accident, destroyed, killed, airport, crew 1 100 128 128
11 Douglas C-47A-35-DL (DC-3) accident, destroyed, killed, airport, crew 1 96 25 24
11 Lockheed C-130H Hercules accident, destroyed, killed, airport, crew 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 12))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
12 Douglas C-47B-20-DK (DC-3) fire, caught, ground, destroyed, de 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 13))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
13 Boeing 747-131F feet, departed, crew, reported, cleared 1 100 17 17
13 Lockheed C-130 Hercules feet, departed, crew, reported, cleared 1 100 7 7
knitr::kable(iran_casn_topic_air %>% filter(topic == 14))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
14 Ilyushin 76MD Simorgh engine, takeoff, landing, forced, lost 1 100 7 7
14 Dassault Falcon 20E engine, takeoff, landing, forced, lost 1 0 0 0
14 Douglas C-47A-20-DL (DC-3) engine, takeoff, landing, forced, lost 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 15))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
15 Dassault Falcon 20F damaged, repair, accident, reportedly, raf 2 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 16))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
16 Boeing 727-286 airport, boeing, international, air, otter 1 57.14286 7 4
knitr::kable(iran_casn_topic_air %>% filter(topic == 17))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
17 Fokker F-28 Fellowship 1000 dc, 3, struck, 4, douglas 1 100 66 66
knitr::kable(iran_casn_topic_air %>% filter(topic == 18))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
18 Tupolev Tu-154M en, route, night, operation, NA 1 100 131 131
18 Douglas C-47 (DC-3) en, route, night, operation, NA 1 0 0 0
18 Lockheed P-3F Orion en, route, night, operation, NA 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 19))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
19 Lockheed C-130 Hercules air, force, transport, base, flying 2 0 0 0
19 Airbus A300B2-203 air, force, transport, base, flying 1 0 0 0
19 Lockheed C-130E Hercules air, force, transport, base, flying 1 0 0 0
knitr::kable(iran_casn_topic_air %>% filter(topic == 20))
topic Aircraft Cause occurance Fatality_rate Total_occupants Total_fatalities
20 Lockheed C-130 Hercules approach, weather, pilot, conditions, visibility 1 100 103 103
20 de Havilland Canada DHC-6 Twin Otter 300 approach, weather, pilot, conditions, visibility 1 100 3 3
20 Lockheed C-130H Hercules approach, weather, pilot, conditions, visibility 1 0 0 0